Setup

library(limma)
library(edgeR)
library(tximport)
library(AnnotationHub)
library(magrittr)
library(scales)
library(tidyverse)
library(ggrepel)
library(RColorBrewer)
library(fgsea)
library(msigdbr)
library(pander)
library(kableExtra)
library(ggfortify)
library(ggpubr)
library(pheatmap)
library(here)
library(goseq)
library(RUVSeq)
library(harmonicmeanp)
library(ngsReports)
library(UpSetR)
library(gridExtra)
if (interactive()) setwd(here("R"))
panderOptions("table.split.table", Inf)
theme_set(theme_bw())

The alignments in this analysis were generated by Steve. He aligned each library (including technical replicates) to the Zebrafish transcriptome from Ensembl Release 94 (GRCz11) using kallisto (v0.43.1). In addition to the standard transcriptome, the two mutant psen2 transcripts were manually added to the reference.

The corresponding set of gene descriptions were then loaded into R as an EnsDb object using the AnnotationHub() infrastructure. Likewise, the set of transcript descriptions were loaded, with the manual addition of the two novel psen2 mutants.

annotation objects

ah <- AnnotationHub() %>%
    subset(species == "Danio rerio") %>%
    subset(rdataclass == "EnsDb")
ensDb <- ah[["AH64906"]]
genes <- genes(ensDb) 
mcols(genes) <- mcols(genes)[!names(mcols(genes)) %in% c("seq_coord_system", "gene_name")]
transcripts <- transcripts(ensDb)
psen2 <- subset(transcripts, gene_id == "ENSDARG00000015540") %>% 
    granges() %>% 
    magrittr::extract(c(1, 1),) 
names(psen2) <- c("psen2T141_L142delinsMISLISV", "psen2N140fs")
mcols(psen2) <- DataFrame(tx_id = names(psen2),
                                                    tx_biotype = c("protein_coding", "nonsense_mediated_decay"),
                                                    gene_id = "ENSDARG00000015540",
                                                    tx_id_version = names(psen2),
                                                    tx_name = names(psen2))
transcripts %<>%
    GRangesList(psen2) %>%
    unlist() %>%
    sort()

transcripts$symbol <- mcols(genes[transcripts$gene_id])$symbol
# tx2gene <- mcols(transcripts)[c("tx_id_version", "gene_id")] 
tx2gene <- mcols(transcripts)[c("tx_id_version", "symbol")] %>%
    subset(symbol != "")

geneCounts <- list.files(path = here("3_kallisto"), recursive = TRUE, pattern = "h5", full.names = TRUE) %>%
    set_names(basename(dirname(.))) %>%
    set_names(str_replace(names(.), "_-", "WT")) %>%
    tximport(type = "kallisto", tx2gene = tx2gene)

genotype checks

Transcript-level counts were imported using catchKallisto() from edgeR in order to check that expression of the psen2 alleles matches the genotype of the fish which was determined by PCRs performed on genomic DNA from each fish extracted from their tail.

From the plot below, we see that fish 8_FS_4 is actually a transheterozygous fish and should be omitted.

transCounts <- list.files(here("3_kallisto/"), full.names = TRUE) %>%
    catchKallisto()

colnames(transCounts$counts) %<>%
    basename() %>%
    str_replace("_-", "WT")


psen2IDs <- subset(transcripts, symbol == "psen2") %>% names()

transCounts$counts %>%
    cpm() %>% 
    set_rownames(str_remove(rownames(.), "\\.[0-9]+")) %>%
    magrittr::extract(psen2IDs,) %>%
    as.data.frame() %>%
    rownames_to_column("Transcript") %>%
    gather(key = "Sample", value = "CPM", -Transcript) %>%
    mutate(Transcript = str_replace(Transcript, "ENSDART.+", "psen2-WT"), 
                 Genotype = str_extract(Sample, "(WT|FAD|FS)")) %>% 
    ggplot(aes(Sample, CPM, fill = Transcript)) +
    geom_bar(stat = "identity", colour = "black") +
    facet_wrap(~Genotype, nrow = 1, scales = "free_x") +
    scale_fill_viridis_d(option = "viridis") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), 
                aspect.ratio = 1) 
*CPM values for each psen2 transcript across all samples.*

CPM values for each psen2 transcript across all samples.

Make the DGE object

minCpm <- 1
minSamples <- 5
dge <- geneCounts$counts %>%
    magrittr::extract(rowSums(cpm(.) > minCpm) >= minSamples,) %>%
    DGEList() %>%
    calcNormFactors()

dge$samples %<>%
    mutate(Sample = rownames(.),
                 Genotype = str_extract(Sample, "(WT|FAD|FS)"),
                 Genotype = factor(Genotype, levels = c("WT", "FS", "FAD")),
                 group = as.integer(Genotype), 
                 Genotype_forPub = case_when(
                    Genotype == "FAD" ~ "EOfAD", 
                    Genotype == "FS" ~ "FS" , 
                    TRUE ~ "WT"
                 ),
                 Genotype_forPub = factor(Genotype_forPub, levels = c("WT", "FS", "EOfAD", "transhet"))) %>%
    set_rownames(.$Sample)

dge$samples["8_FS_4",]$Genotype_forPub <- "transhet"

dge$samples %<>%   
    mutate(Sex = case_when(
                 Sample %in% c("15_WT_1", 
                                            "14_WT_2",  
                                            "13_WT_3", 
                                            "10_FS_1", 
                                            "5_FS_2", 
                                            "9_FS_3",  
                                            "6_FAD_1",
                                            "4_FAD_2",
                                            "3_FAD_3") ~ "Female" ,
                 TRUE ~ "Male")) %>% 
    set_rownames(.$Sample)

Gene-level counts were imported using tximport, mapping transcripts to genes. Some genes exist in the primary assembly and on alternate assemblies for specific regions, and these were considered as separate transcripts of the same gene for read summarisation purposes. Transcript counts were thus mapped to genes using the gene symbol (e.g. psen2), instead of the gene id.

Upon loading into a DGE object, genes which had less than 1 in at least 5 of the samples were omitted. This equated to about 31 reads for a gene in at least 5 samples for inclusion in downstream analysis, giving a total of 18,808 of the original genes for DGE analysis.

library sizes

*Total counts from each library after assigning to genes*

Total counts from each library after assigning to genes

PCA on entire dataset

I next will perform a Principal Component Analysis to visualise the similarity between samples. Note that sample 12_WT_4 appears quite distant from the others. No clear seperation is observed of genotypes or sex. Also PC1 and PC2 only account for ~30% of the total vriation in this dataset, which is lower than one would expect

dge %>% 
    cpm(log = TRUE) %>% 
    t() %>% 
    prcomp() %>% 
    autoplot(
        data = tibble(Sample = rownames(.$x)) %>%
            left_join(dge$samples),
        colour = "Genotype_forPub", 
        size = 4, 
        shape = "Sex",
    ) +
  geom_label_repel(
    aes(label = Sample, colour = Genotype_forPub), show.legend = FALSE
  ) +
    scale_color_manual(values = c("darkcyan",  "firebrick" , "purple4", "grey30"))

MDS plot

Another way of visualising the similarity between samples is a multi-dimensional scaling plot (MDS) plot. The distances between samples can be interpreted as the leading log2-fold-change, so that similar samples will cluster together. Like in the PCA plot, sample 12_WT_4 appears quite distant from the rest, and no seperation between genotypes or sex is observed.

# save this plot as an object for later. 
mds1 <- dge %>% 
    plotMDS(plot = FALSE) %>%
    extract2("cmdscale.out") %>%
    set_colnames(paste0("Dim", 1:2)) %>% 
    as.data.frame() %>% 
    rownames_to_column("Sample") %>% 
    left_join(dge$samples) %>% 
    ggplot(aes(Dim1, Dim2)) +
    geom_point(aes(colour = Genotype_forPub, shape = Sex), size = 4) +
    labs(x = "Dimension 1", 
             y = "Dimension 2", 
             colour = "Genotype") +
    geom_label_repel(aes(label = Sample, colour = Genotype_forPub)) +
    scale_colour_manual(values = c("grey30", "firebrick", "darkcyan", "purple")) +
    theme(aspect.ratio = 1)

mds1

drop samples

I showed before that 8_FS_4 is actually a transhet sample. Therefore, I will omit it from the analysis.

In the PCA and MDS plots above, and in the previous limma analysis by Steve, sample 12_WT_4 was an obvious outlier as it did not cluster with the rest of the samples, and steve showed it was downweighted majorly compared to the others and will also e omitted.

dropSamples <- c("8_FS_4", "12_WT_4")

dge <- dge[,!colnames(dge) %in% dropSamples]

MDS plots

the MDS and PCA plots are now regenerated below

dge %>% 
    plotMDS(plot = FALSE) %>%
    extract2("cmdscale.out") %>%
    set_colnames(paste0("Dim", 1:2)) %>% 
    as.data.frame() %>% 
    rownames_to_column("Sample") %>% 
    left_join(dge$samples) %>% 
    ggplot(aes(Dim1, Dim2)) +
    geom_point(aes(colour = Genotype_forPub, shape = Sex), size = 4) +
    #geom_label_repel(aes(label = Sample, colour = Genotype), show.legend = FALSE) +
    theme_bw() +
    labs(x = "Dimension 1", 
             y = "Dimension 2", 
             colour = "Genotype") + 
    scale_colour_manual(values = c("grey30", "firebrick", "darkcyan" ))

dge %>%
    cpm(log = TRUE) %>%
    t() %>%
    prcomp() %>%
    autoplot(
        data = tibble(Sample = rownames(.$x)) %>%
            left_join(dge$samples),
        colour = "Genotype", 
        size = 4, 
        shape = "Sex",
    ) +
  geom_label_repel(
    aes(label = Sample, colour = Genotype),
    show.legend = FALSE
  )  +
    scale_colour_manual(values = c("grey30", "firebrick", "darkcyan" ))

dge %>%
    cpm(log = TRUE) %>%
    t() %>%
    prcomp() %>% 
    summary() %>%
    extract2("importance") %>%
    magrittr::extract(,paste0("PC", 1:5)) %>%
    kable(caption = paste("*First five principal components, showing that the first two only account for", percent(.[3,2]), "of the total variance, which is below expectations*")) %>% 
    kable_styling()
First five principal components, showing that the first two only account for 33% of the total variance, which is below expectations
PC1 PC2 PC3 PC4 PC5
Standard deviation 22.38785 18.64065 17.46163 16.30118 14.52113
Proportion of Variance 0.19215 0.13321 0.11689 0.10187 0.08084
Cumulative Proportion 0.19215 0.32536 0.44225 0.54412 0.62495

Hierarchical clustering

Another visualisation here is performed where samples are clustered based on their Euclidean distance.

anno <-
    dge %>% 
  cpm(log = T) %>% 
  t() %>% 
  rownames() %>% 
  as.data.frame() %>% 
  set_colnames("Sample") %>% 
  left_join(dge$samples) %>% 
  dplyr::select(Sample, c(Genotype = Genotype_forPub), Sex) %>% 
  column_to_rownames("Sample")

dge %>% 
  cpm(log = T) %>% 
  t() %>% 
  dist(method = "euclidean") %>% 
  as.matrix() %>% 
  pheatmap(
    color = viridis::viridis_pal(option = "viridis")(100), 
    annotation_col = anno, 
    annotation_colors = list(Genotype = c(FS = "firebrick", 
                                                                                WT = "grey30", 
                                                                                EOfAD = "darkcyan"), 
                                                     Sex = c(Male = "#FF9700", 
                                                                    Female = "#0370FF")),
    show_rownames = F, 
    show_colnames = F, 
    annotation_row = anno, 
    cellwidth = 10, cellheight = 10, 
    treeheight_row = 10, treeheight_col = 10)

DGE analysis using edgeR

Steve previously performed a limma analysis. Here, I will perform a DGE analysis using the GLM and likelihood ratio tests of edgeR, which in our experience, has more power to detect DE genes.

# model matrix
design <- model.matrix(~Genotype, data = dge$samples) %>% 
    set_colnames(gsub("Genotype", "", colnames(.)))

# fit the glm
fit <- dge %>% 
    estimateDisp(design) %>% 
    glmFit(design)

topTables_glm <- design %>% colnames() %>% .[2:3] %>% 
   sapply(function(x){
    glmLRT(fit, coef = x) %>%
      topTags(n = Inf) %>%
      .[["table"]] %>%
      as.data.frame() %>%
        rownames_to_column("symbol") %>% 
      arrange(PValue) %>%
       mutate(
        coef = x,
        DE = FDR < 0.05
      ) %>% 
      dplyr::select(
        symbol, logFC, logCPM, PValue, FDR, DE, everything()  
      ) %>% 
        as_tibble()
      
  }, simplify = FALSE)  

topTables_glm %>% 
    bind_rows() %>% 
    ggplot(aes(logFC, -log10(PValue))) +
    geom_point(aes(colour = DE)) +
    geom_label_repel(aes(label = symbol), 
                                     data = . %>% 
                                        dplyr::filter(DE == T) %>% 
                                        dplyr::slice(1:20)) +
    facet_wrap(~coef, scales = "free_x") +
    scale_colour_manual(values = c("grey50", "red")) +
    theme_bw() +
    theme(legend.position = "bottom")

topTables_glm %>% 
    bind_rows() %>% 
    ggplot(aes(logCPM, logFC)) +
    geom_point(aes(colour = DE)) +
    geom_label_repel(aes(label = symbol), 
                                     data = . %>% 
                                        dplyr::filter(DE == T), 
                                     size = 3, alpha = 0.5) +
    facet_wrap(~coef, ncol = 1) +
    scale_colour_manual(values = c("grey50", "red")) +
    theme_bw()
*MD plot for psen2^N140fs/+^ samples and psen2^T141_L142delinsMISLISV/+^ compared to psen2^+/+^ samples*

MD plot for psen2N140fs/+ samples and psen2T141_L142delinsMISLISV/+ compared to psen2+/+ samples

I want to check for a GC or length bias for differential expression. If such a bias is observed, I can correct for it using cqn. GC and length information is easily available from EnsDb objects from release 98 onwards, so this release was used. No expressed genes were noted as absent from the annotations for Release 98, despite the differences between releases. GC content and Length were taken as weighted averages and simple averages respectively. No discernible bias is observed, supporting that gene set testing can be performed in this dataset.

ens98 <- ah[["AH74989"]]

genes98 <- genes(ens98)

ex <- exonsBy(ens98, "tx")

tr <- transcripts(ens98)

tr$len <- ex %>%
    width %>%
    lapply(sum) %>%
    .[names(tr)] %>%
    unlist()

gnBias <- mcols(tr) %>%
    as.data.frame() %>%
    group_by(gene_id) %>%
    summarise(
        n = n(),
        meanGC = sum(len*gc_content) / sum(len),
        len = mean(len)
    )

mcols(genes) %<>%
    as.data.frame() %>%
    left_join(gnBias) %>%
    DataFrame()

gnGC <- genes %>% 
    mcols() %>% 
    as_tibble() %>% 
    unchop(entrezid) 
topTables_glm %>% 
    lapply(function(x) {
        x %>% 
            left_join(gnGC) %>% 
            as_tibble()
    }) %>% 
    bind_rows() %>% 
    ggplot(aes(meanGC, sign(logFC)*-log10(PValue))) +
    geom_point(aes(colour = DE)) +
    geom_smooth(se = F) +
    facet_wrap(~coef, ncol = 1) +
    scale_colour_manual(values = c("grey50", "red")) +
    theme_bw()

topTables_glm %>% 
    lapply(function(x) {
        x %>% 
            left_join(gnGC) %>% 
            as_tibble()
    }) %>% 
    bind_rows() %>% 
    ggplot(aes(len, sign(logFC)*-log10(PValue))) +
    geom_point(aes(colour = DE)) +
    geom_smooth(se = F) +
    geom_hline(yintercept = 0, linetype = 2) +
    scale_x_log10() +
    facet_wrap(~coef, ncol = 1) +
    scale_colour_manual(values = c("grey50", "red")) +
    theme_bw()

~~ Enrichment analysis ~~

Here, I will perform enichment analysis both within the DE genes using goseq, which allows for a covariate to be a predictor variable, and then within ranked lists using fry (which handles inter-gene correlations better than other methods such as GSEA). The gene sets which will be used are the KEGG and GO gene sets from MSigDB, and the IRE gene sets which were previously defined by Nhi from our lab which contains genes wich contain an iron-responsive element in the untranslated regions.

the KEGG and GO gene sets were imported using the msigdbr package. The GO terms were only included if they had 3 or more steps back to the ontology root terms. The ire gene sets were provided by Nhi as an .rds object which was directly imported.

import the gene sets

ens2Entrez <- genes %>% 
    as_tibble() %>% 
    dplyr::select(symbol, entrezid) %>% 
    unchop(entrezid) %>% 
    dplyr::filter(symbol %in% rownames(dge)) %>% 
    dplyr::filter(!is.na(entrezid))

KEGG <-
    msigdbr("Danio rerio", category = "C2", subcategory = "CP:KEGG") %>%
    dplyr::rename(entrezid = entrez_gene) %>% 
    inner_join(ens2Entrez) %>%
    distinct(gs_name, symbol, .keep_all = TRUE) %>% 
  split(f = .$gs_name) %>%
  lapply(extract2, "symbol")

# obtained the IRE gene sets from Nhi
ireGenes <- readRDS(here("R/zebrafishIreGenes.rds")) %>% 
    lapply(function(x){
        x %>% 
            as.data.frame() %>% 
            set_colnames("gene_id") %>% 
            as_tibble() %>% 
            left_join(genes %>% 
                                    as_tibble() %>% 
                                    dplyr::select(gene_id, symbol)) %>% 
            .$symbol
    })

# GO terms
goSummaries <- url("https://uofabioinformaticshub.github.io/summaries2GO/data/goSummaries.RDS") %>%
  readRDS() %>%
  mutate(
    Term = Term(id),
    gs_name = Term %>% str_to_upper() %>% str_replace_all("[ -]", "_"),
    gs_name = paste0("GO_", gs_name)
    )
minPath <- 3

go <- msigdbr("Danio rerio", category = "C5") %>% 
    dplyr::rename(entrezid = entrez_gene) %>% 
    inner_join(ens2Entrez) %>%
    distinct(gs_name, symbol, .keep_all = TRUE) %>% 
    split(f = .$gs_name) %>%
    lapply(extract2, "symbol")

Enrichment testing within the DE gene sets: goseq

To test within the DE genes for over-representation of these gene sets, I will use goseq. goseq quantifies the probability of a gene being considered as DE based on a single covariate and corrects for it by estimatting a probability weight function (PWF). Here, I will use the average transcript length per gene. Very minimal over-representation is obsevred in both lists of DE genes for each mutation.

pwfs <- topTables_glm %>%
        lapply(function(x) {
        x %>% 
            left_join(gnGC, by = "symbol") %>% 
            as_tibble() %>% 
                distinct(symbol, .keep_all = TRUE)
    }) %>% 
    lapply(function(x) {
        x %>% 
            with(
                nullp(
                    DEgenes = structure(DE, names = symbol), 
                    bias.data = len
                )
            )
    })

goseq(pwfs$FS, gene2cat = c(KEGG, go, ireGenes)) %>% 
    as_tibble() %>% 
    mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr")) %>% 
    head(5) %>% 
    dplyr::select(-under_represented_pvalue) %>% 
    kable(caption = "KEGG, GO and IRE gene sets approaching being over-represented in the DE list for the FS mutation") %>% 
    kable_styling()
KEGG, GO and IRE gene sets approaching being over-represented in the DE list for the FS mutation
category over_represented_pvalue numDEInCat numInCat FDR
GO_MUSCLE_FILAMENT_SLIDING 1.00e-07 4 27 0.0013886
GO_ACTIN_MEDIATED_CELL_CONTRACTION 2.22e-05 4 94 0.0793438
GO_SKELETAL_MUSCLE_CONTRACTION 2.30e-05 3 31 0.0793438
GO_PHOSPHOTRANSFERASE_ACTIVITY_NITROGENOUS_GROUP_AS_ACCEPTOR 3.42e-05 2 5 0.0884924
GO_ACTIN_FILAMENT_BASED_MOVEMENT 4.71e-05 4 114 0.0973880
goseq(pwfs$FAD, gene2cat = c(KEGG,go, ireGenes)) %>% 
    as_tibble() %>% 
    mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr")) %>% 
    head(5) %>% 
    dplyr::select(-under_represented_pvalue) %>% 
    kable(caption = "KEGG, GO and IRE gene sets approaching being over-represented in the DE list for the FAD mutation") %>% 
    kable_styling()
KEGG, GO and IRE gene sets approaching being over-represented in the DE list for the FAD mutation
category over_represented_pvalue numDEInCat numInCat FDR
GO_RNA_BINDING 0.1133806 1 1420 1
GO_DNA_CATABOLIC_PROCESS_ENDONUCLEOLYTIC 1.0000000 0 26 1
GO_GLUTAMATERGIC_SYNAPSE 1.0000000 0 353 1
GO_ION_TRANSPORT 1.0000000 0 1297 1
GO_POSITIVE_REGULATION_OF_CELL_PROJECTION_ORGANIZATION 1.0000000 0 371 1

Gene set testing on ranked lists (fry)

To obtain a more complete view on the changes to gene expression due to each mutation, I will use fry from the limma package, which can take into account inter-gene correlations. Due to the highly overlapping nature of GO terms, I will only look at the ire and KEGG gene sets.

Using fry, only genes inolved in the Notch signalling pathway is found to be alered in by the FS mutation.

fryRes <- design %>% colnames() %>% .[2:3] %>% 
  sapply(function(x) {
    cpm(dge$counts, log = TRUE) %>% 
    fry(
      index = c(KEGG, ireGenes),
      design = design, 
      contrast = x, 
      sort = "mixed"
    ) %>% 
    rownames_to_column("pathway") %>% 
    as_tibble() %>% 
      mutate(coef = x, 
             sig = FDR.Mixed < 0.05)
  }, simplify = FALSE) 

fryRes$FS %>% 
    head(5) %>% 
    kable(caption = "Top 5 most sig altered gene sets found using fry due to heterozygosity for the psen2 N140fs mutation" ) %>% 
    kable_styling()
Top 5 most sig altered gene sets found using fry due to heterozygosity for the psen2 N140fs mutation
pathway NGenes Direction PValue FDR PValue.Mixed FDR.Mixed coef sig
KEGG_NOTCH_SIGNALING_PATHWAY 43 Down 0.2102627 0.9832361 0.0000013 0.0002482 FS TRUE
KEGG_NON_HOMOLOGOUS_END_JOINING 11 Up 0.2363334 0.9832361 0.0521496 0.9750871 FS FALSE
KEGG_TASTE_TRANSDUCTION 21 Down 0.1660423 0.9832361 0.0608465 0.9750871 FS FALSE
KEGG_FATTY_ACID_METABOLISM 37 Up 0.0128414 0.9082459 0.0867987 0.9750871 FS FALSE
KEGG_ARACHIDONIC_ACID_METABOLISM 25 Up 0.1325214 0.9832361 0.0940346 0.9750871 FS FALSE
fryRes$FAD %>% 
    head(5) %>% 
    kable(caption = "Top 5 most sig altered gene sets found using fry due to heterozygosity for the psen2 FAD-like mutation" ) %>% 
    kable_styling()
Top 5 most sig altered gene sets found using fry due to heterozygosity for the psen2 FAD-like mutation
pathway NGenes Direction PValue FDR PValue.Mixed FDR.Mixed coef sig
KEGG_CELL_CYCLE 116 Down 0.0808469 0.9943924 0.0054747 0.6034515 FAD FALSE
KEGG_OOCYTE_MEIOSIS 101 Up 0.8279451 0.9943924 0.0083455 0.6034515 FAD FALSE
KEGG_VIBRIO_CHOLERAE_INFECTION 49 Up 0.4082509 0.9943924 0.0135124 0.6034515 FAD FALSE
KEGG_BASAL_CELL_CARCINOMA 47 Down 0.0175232 0.9943924 0.0152448 0.6034515 FAD FALSE
KEGG_ENDOMETRIAL_CANCER 50 Down 0.1119994 0.9943924 0.0275173 0.6034515 FAD FALSE

~~ RUVSeq to assist with identification of significant gene sets ~~

Since very little conclusions were found about the FAD like mutation, I will use RUVseq to see whether I can detect more differential expression of genes/gene sets. I will use the RUVg method, which uses negative control genes. In this case, it will be the least 10,000 DE genes from the initial differential expression due to psen2 genotype.

RUVneg <- 
  dge %>% 
  estimateDisp(design) %>%
  glmFit(design) %>%
  glmLRT(coef = 2:3) %>% # use both psen2 genotype coefs
  topTags(n = Inf, adjust.method = "fdr", sort.by = "p") %>% 
  .$table %>% 
    as.data.frame() %>% 
    rownames_to_column("symbol") %>% 
    arrange(desc(PValue)) %>% 
    .[1:10000, ] %>% 
    .$symbol
### Perform the RUVg method with 1 factor of unwanted variation removed, 
RUV_k1 <- dge$counts %>% 
  round %>% 
  RUVg(RUVneg, 1)

PCA

A PCA analysis was performed on the RUVSeq normalised counts. Not enough of the total variance was explained by PC1 + PC2. However, some seperation between the FS samples was observed.

pca_RUV_k1 <- RUV_k1$normalizedCounts %>% 
  as.data.frame() %>% 
    cpm(log = T) %>% 
    t() %>% 
    prcomp()

pca_RUV_k1 %>% 
  autoplot(
    data = tibble(Sample = rownames(.$x)) %>%
      left_join(dge$samples),
    colour = "Genotype_forPub", 
    size = 4, 
    shape = "Sex"
  ) +
  geom_label_repel(
    aes(label = Sample, colour = Genotype_forPub),
    show.legend = FALSE
  ) +
    scale_colour_manual(values = c("grey30", "firebrick", "darkcyan" )) +
    theme(aspect.ratio = 1) +
    labs(colour = "Genotype")

    #ggsave("plots/PCA_RUV.png", width = 10, height = 10, units = "cm", dpi = 600)

MDS

The MDS plot shows that the FS samples also seperate from the others.

RUV_k1$normalizedCounts %>% 
    cpm(log=T) %>% 
    plotMDS(plot = FALSE) %>%
    extract2("cmdscale.out") %>%
    set_colnames(paste0("Dimension ", 1:2)) %>% 
    as.data.frame() %>% 
    rownames_to_column("Sample") %>% 
    left_join(dge$samples) %>% 
    ggplot(aes(`Dimension 1`, `Dimension 2`)) +
    geom_point(aes(colour = Genotype_forPub, shape = Sex), size = 4) +
    scale_colour_manual(values = c("grey50", "firebrick", "darkcyan" )) +
    geom_label_repel(aes(label = Sample, colour = Genotype_forPub), show.legend = FALSE) +
    theme(aspect.ratio = 1) +
    labs(colour = "Genotype")

Hierarchical clustering

RUV_k1$normalizedCounts %>% 
  cpm(log = T) %>% 
  t() %>% 
  dist(method = "euclidean") %>% 
  as.matrix() %>% 
  pheatmap(
    color = viridis::viridis_pal(option = "viridis")(100), 
    annotation_col = anno, 
        annotation_colors = list(Genotype = c(FS = "firebrick", 
                                                                                WT = "grey30", 
                                                                                EOfAD = "darkcyan")),
    show_rownames = F, show_colnames = F, 
    annotation_row = anno, 
    cellwidth = 10, cellheight = 10, 
    treeheight_row = 5, treeheight_col = 5)

DE analysis with RUVSeq

I next will perform an additional DGE analysis including the W_1 covariate generated by RUVSeq in the design matrix.

dge$samples %<>% 
    cbind(RUV_k1$W)

# model matrix
design_RUV <- model.matrix(~Genotype + W_1, data = dge$samples)%>% 
    set_colnames(gsub("Genotype", "", colnames(.)))

# fit the glm
fit_RUV <- dge %>% 
    estimateDisp(design_RUV) %>% 
    glmFit(design_RUV)

topTables_glm_RUV <- design_RUV %>% colnames() %>% .[2:3] %>% 
   sapply(function(x){
    glmLRT(fit_RUV, coef = x) %>%
      topTags(n = Inf) %>%
      .[["table"]] %>%
      as.data.frame() %>%
        rownames_to_column("symbol") %>% 
      arrange(PValue) %>%
       mutate(
        coef = x,
        DE = FDR < 0.05
      ) %>% 
      dplyr::select(
        symbol, logFC, logCPM, PValue, FDR, DE, everything()  
      ) %>% 
        as_tibble()
      
  }, simplify = FALSE)  

topTables_glm_RUV %>% 
    bind_rows() %>% 
    ggplot(aes(logFC, -log10(PValue))) +
    geom_point(aes(colour = DE)) +
    facet_wrap(~coef, scales = "free_x") +
    # geom_label_repel(aes(label = symbol), 
    #                                data = . %>% 
    #                                   dplyr::filter(DE == T) %>% 
    #                                   dplyr::slice(1:20)) +
    scale_colour_manual(values = c("grey50", "red")) +
    theme_bw() +
    theme(legend.position = "bottom")

topTables_glm %>% 
    bind_rows() %>% 
    ggplot(aes(logCPM, logFC)) +
    geom_point(aes(colour = DE)) +
    geom_label_repel(aes(label = symbol), 
                                     data = . %>% 
                                        dplyr::filter(DE == T), 
                                     size = 3, alpha = 0.5) +
    facet_wrap(~coef, ncol = 1) +
    scale_colour_manual(values = c("grey50", "red")) +
    theme_bw()
*MD plot for psen2^N140fs/+^ samples and psen2^T141_L142delinsMISLISV/+^ compared to psen2^+/+^ samples*

MD plot for psen2N140fs/+ samples and psen2T141_L142delinsMISLISV/+ compared to psen2+/+ samples

~~ Enrichment analysis with RUVSeq ~~

Within DE lists: goseq

The goseq analysis was repeated. However, no over-representation was observed. The top 5 gene sets in each genotype are shown in the tables below.

pwfs_RUV <- topTables_glm_RUV %>%
        lapply(function(x) {
        x %>% 
            left_join(gnGC, by = "symbol") %>% 
            as_tibble() %>% 
                distinct(symbol, .keep_all = TRUE)
    }) %>% 
    lapply(function(x) {
        x %>% 
            with(
                nullp(
                    DEgenes = structure(DE, names = symbol), 
                    bias.data = len
                )
            )
    })

goseq(pwfs_RUV$FS, gene2cat = c(KEGG, ireGenes, go)) %>% 
    as_tibble() %>% 
    mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr")) %>% 
    head(5) %>% 
    dplyr::select(-under_represented_pvalue) %>% 
    kable(caption = "KEGG, GO and IRE gene sets approaching being over-represented in the DE list for the FS mutation") %>% 
    kable_styling()
KEGG, GO and IRE gene sets approaching being over-represented in the DE list for the FS mutation
category over_represented_pvalue numDEInCat numInCat FDR
GO_PHOSPHOTRANSFERASE_ACTIVITY_NITROGENOUS_GROUP_AS_ACCEPTOR 0.0000196 2 5 0.2032223
GO_TRNA_3_END_PROCESSING 0.0000703 2 9 0.3635225
GO_NUCLEOSIDE_DIPHOSPHATE_METABOLIC_PROCESS 0.0009853 3 140 1.0000000
KEGG_ALZHEIMERS_DISEASE 0.0011036 3 145 1.0000000
GO_NCRNA_3_END_PROCESSING 0.0012017 2 36 1.0000000
goseq(pwfs_RUV$FAD, gene2cat = c(KEGG, ireGenes, go)) %>% 
    as_tibble() %>% 
    mutate(FDR = p.adjust(over_represented_pvalue, method = "fdr")) %>% 
    arrange(numDEInCat) %>% 
    head(5) %>% 
    dplyr::select(-under_represented_pvalue) %>% 
    kable(caption = "KEGG, GO and IRE gene sets approaching being over-represented in the DE list for the FS mutation") %>% 
    kable_styling()
KEGG, GO and IRE gene sets approaching being over-represented in the DE list for the FS mutation
category over_represented_pvalue numDEInCat numInCat FDR
GO_SMALL_RIBOSOMAL_SUBUNIT_RRNA_BINDING 1 0 10 1
KEGG_CALCIUM_SIGNALING_PATHWAY 1 0 143 1
GO_NEGATIVE_REGULATION_OF_MOLECULAR_FUNCTION 1 0 843 1
GO_REGULATION_OF_PROTEIN_MODIFICATION_PROCESS 1 0 1458 1
GO_RESPONSE_TO_LIGHT_STIMULUS 1 0 282 1

All genes: fry

I next will repeat fry to look within the entire list of genes detectable in the experiemnt.

fryRes_RUVk1 <- design %>% colnames() %>% .[2:3] %>% 
  sapply(function(x) {
        cpm(dge, log = TRUE) %>% 
    fry(
      index = c(KEGG, ireGenes),
      design = design_RUV, 
      contrast = x, 
      sort = "mixed"
    ) %>% 
    rownames_to_column("pathway") %>% 
    as_tibble() %>% 
      mutate(coef = x, 
             sig = FDR.Mixed < 0.05)
  }, simplify = FALSE) 

fryRes_RUVk1 %>% 
    bind_rows() %>% 
    dplyr::filter(sig == TRUE) %>% 
    dplyr::select(pathway, NGenes, Direction, PValue.Mixed, FDR.Mixed, coef) %>% 
    kable(caption = "Gene sets showing significant changes to expression as a group using fry, at an FDR of 5%") %>% 
    kable_styling()
Gene sets showing significant changes to expression as a group using fry, at an FDR of 5%
pathway NGenes Direction PValue.Mixed FDR.Mixed coef
KEGG_NOTCH_SIGNALING_PATHWAY 43 Down 2.60e-06 0.0005004 FS
KEGG_OOCYTE_MEIOSIS 101 Up 1.09e-05 0.0018510 FAD
KEGG_CELL_CYCLE 116 Down 1.95e-05 0.0018510 FAD

harmonic mean-p value of fry, camera and fgsea

Since only few gene sets are observed to be altered by the FAD-like mutation, I will perform a gene set testing method inspired by the EGSEA method, which performs multiple gene set testing algorithms (here, I will use fry, camera and fgsea), then combine the resulting p-values by calculating the harmonic mean p-value. The HMP does push the average towards smaller p-values. However, it does not produce any smaller p-values that already exist from each of the individual test. It is robust to dependent tests (which in this case, it is), and is thought to be less restrictive than other methods.

cameraRes_RUVk1 <- design %>% colnames() %>% .[2:3] %>% 
  sapply(function(x) {
        cpm(dge, log = TRUE) %>% 
    camera(
      index = c(KEGG, ireGenes),
      design = design_RUV, 
      contrast = x, 
      inter.gene.cor = NA
    ) %>% 
    rownames_to_column("pathway") %>% 
    as_tibble() %>% 
      mutate(coef = x)
  }, simplify = FALSE) 

ranks <- topTables_glm_RUV %>% 
    lapply(function(x) { x %>% 
            mutate(rankstat = sign(logFC)*-log10(PValue)) %>% 
            arrange(desc(rankstat)) %>% 
            dplyr::select(c("symbol", "rankstat")) %>% 
            with(structure(rankstat, names = symbol))
    }) 

# set a seed for a reproducible result. 
set.seed(1)
fgsea_RUVk1 <- ranks %>% 
    sapply(function(x) {
        fgseaMultilevel(stats = x, pathways = c(KEGG, ireGenes))}, 
        simplify = FALSE)

fgsea_RUVk1$FS %<>% 
    mutate(coef = "FS")

fgsea_RUVk1$FAD %<>% 
    mutate(coef = "FAD")

HMP <- fryRes_RUVk1 %>% 
  bind_rows() %>% 
  dplyr::select(pathway, PValue.Mixed, coef) %>% 
  dplyr::rename(fry_p = PValue.Mixed) %>% 
  left_join(cameraRes_RUVk1 %>% 
              bind_rows() %>% 
              dplyr::select(pathway, PValue, coef), 
            by = c("pathway", "coef")) %>% 
  dplyr::rename(camera_p = PValue) %>% 
  left_join(fgsea_RUVk1 %>% 
              bind_rows() %>% 
              dplyr::select(pathway, pval, coef), 
            by = c("pathway", "coef")) %>% 
  dplyr::rename(fgsea_p = pval) %>% 
  bind_rows() %>% 
  nest(p = one_of(c("fry_p", "camera_p", "fgsea_p"))) %>% 
  mutate(
    harmonic_p = vapply(p, function(x){
      x <- unlist(x)
      x <- x[!is.na(x)]
      p.hmp(x, L = 3)
    }, numeric(1))
  ) %>% 
  unnest() %>% 
  mutate(harmonic_p_FDR = p.adjust(harmonic_p, "fdr"), 
         sig = harmonic_p_FDR < 0.05) %>% 
  arrange(harmonic_p_FDR)

sigpaths <- HMP %>% 
    dplyr::filter(sig == TRUE) %>% 
    .$pathway

HMP %>% 
    dplyr::filter(pathway %in% sigpaths) %>% 
    ggplot(aes(coef, pathway, size =-log10(harmonic_p))) +
    geom_point(aes(colour = sig)) +
    geom_text(
    aes(label = harmonic_p_FDR %>% signif(1)), 
    size = 2.5, 
    vjust = 2.5
  ) +
  labs(colour = "harmonic mean p < 0.05 \nafter FDR adjustment")+
  ggpubr::theme_pubclean()+
  theme(legend.position = "right")+
  scale_color_manual(values = c("grey30", "turquoise3")) 

nums <- HMP %>% 
    dplyr::select(pathway, coef, harmonic_p_FDR) %>% 
    dplyr::filter(pathway %in% sigpaths) %>% 
    mutate(num = case_when(
        harmonic_p_FDR < 0.05 ~ paste(harmonic_p_FDR %>% signif(2)),
        TRUE ~ "ns"
    )) %>% 
    dplyr::select(pathway, coef, num) %>% 
    spread(key = "coef", value = "num") %>% 
    column_to_rownames("pathway") %>% 
    t()


HMP %>% 
    dplyr::select(pathway, coef, harmonic_p_FDR) %>% 
    dplyr::filter(pathway %in% sigpaths) %>% 
    mutate(harmonic_p_FDR = -log10(harmonic_p_FDR)) %>% 
    spread(key = "coef", value = "harmonic_p_FDR") %>% 
    column_to_rownames("pathway") %>% 
    t() %>% 
    pheatmap(color = viridis_pal(option = "magma")(100), 
                     treeheight_row = 0, treeheight_col = 0, 
                     cellwidth = 30, clustering_method = "median", 
                     cellheight = 30, 
                     display_numbers = nums, 
                     #angle_col = 45,
                     number_color = "white")

Using the HMP method of combining p-values for GSEA, 6 gene sets significantly altered by the FAD mutation, and 6 gene sets significantly altered by the FS mutation. The ribosome gene set is altered by both mutations. However, it is much more significant in the FAD. The AD gene set is only significant in the FAD.

No ire gene sets were found to be significantly altered. However, out of all the ire gene sets, the ire3_all was the most significantly altered in the FAD.

HMP %>% 
    dplyr::filter(pathway %>% grepl(pathway, pattern = "ire")) %>% 
    dplyr::select(-c(fry_p, fgsea_p, camera_p)) %>% 
    kable(caption = "harmonic mean p-value of IRE gene sets") %>% 
    kable_styling()
harmonic mean p-value of IRE gene sets
pathway coef harmonic_p harmonic_p_FDR sig
ire3_all FAD 0.0378574 0.3785742 FALSE
ire3_all FS 0.1993138 0.7012893 FALSE
ire3_hq FS 0.5549262 0.7988750 FALSE
ire5_all FS 0.5187127 0.7988750 FALSE
ire5_hq FS 0.6835777 0.7988750 FALSE
ire3_hq FAD 0.3569070 0.7988750 FALSE
ire5_all FAD 0.6142390 0.7988750 FALSE
ire5_hq FAD 0.7467348 0.7988750 FALSE

exploration of gene sets

Upset plots of leading edge genes.

The plots below show that the signals for the sig gene sets are mostly independent of one another. (As shown by the leading edge genes, which can be thought of as the core genes which drive the enrichment of the gene set. )

fgsea_RUVk1$FS %>% 
    dplyr::filter(pathway %in% c(HMP %>% 
                                                                dplyr::filter(coef == "FS") %>% 
                                                                dplyr::filter(sig == TRUE) %>% .$pathway)) %>% 
    dplyr::select(pathway, leadingEdge) %>% 
    mutate(pathway = str_remove(pathway, "KEGG_"), 
                 pathway = str_replace(pathway, 
                                                            pattern = "TRANSENDOTHELIAL_MIGRATION", 
                                                            replacement = "TRANSENDO_MIGR...")) %>% 
    unnest %>% 
    split(f = .$pathway) %>% 
    lapply(magrittr::extract2,"leadingEdge") %>% 
    fromList() %>% 
    upset(order.by = "freq", 
                nsets = length(.), 
                mb.ratio = c(0.5, 0.5)
                )

fgsea_RUVk1$FAD %>% 
    dplyr::filter(pathway %in% c(HMP %>% 
                                                                dplyr::filter(coef == "FAD") %>% 
                                                                dplyr::filter(sig == TRUE) %>% .$pathway)) %>% 
    dplyr::select(pathway, leadingEdge) %>% 
    mutate(pathway = str_remove(pathway, "KEGG_")) %>% 
    unnest %>% 
    split(f = .$pathway) %>% 
    lapply(magrittr::extract2,"leadingEdge") %>% 
    fromList() %>% 
    upset(order.by = "freq", nsets = length(.), 
                mb.ratio = c(0.5, 0.5))

pheatmaps of logFCs

topTables_glm_RUV %>% 
    bind_rows() %>% 
    dplyr::filter(symbol %in% KEGG$KEGG_RIBOSOME) %>% 
    dplyr::select(symbol, logFC, coef) %>% 
    spread(key = "coef", value = "logFC") %>%
    column_to_rownames("symbol") %>% 
    pheatmap(
        color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
  breaks = c(
    seq(min(.), 0, length.out=ceiling(100/2) + 1), 
    seq(max(.)/100, max(.), length.out=floor(100/2))
    ), 
  treeheight_row = 0, treeheight_col = 0,
  main = "KEGG_RIBOSOME"
    )

topTables_glm_RUV %>% 
    bind_rows() %>% 
    dplyr::filter(symbol %in% KEGG$KEGG_OXIDATIVE_PHOSPHORYLATION) %>% 
    dplyr::select(symbol, logFC, coef) %>% 
    spread(key = "coef", value = "logFC") %>%
    column_to_rownames("symbol") %>% 
    pheatmap(color = colorRampPalette(rev(brewer.pal(n = 7, name =
  "RdBu")))(100),
  treeheight_row = 0, 
  main = "KEGG_OXIDATIVE_PHOSPHORYLATION",
  treeheight_col = 0,
  breaks = c(
    seq(min(.), 0, length.out=ceiling(100/2) + 1), 
    seq(max(.)/100, max(.), length.out=floor(100/2))
    )
    )

session info

sessionInfo() %>% 
    pander()

R version 4.0.2 (2020-06-22)

Platform: x86_64-apple-darwin17.0 (64-bit)

locale: en_AU.UTF-8||en_AU.UTF-8||en_AU.UTF-8||C||en_AU.UTF-8||en_AU.UTF-8

attached base packages: stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: ensembldb(v.2.12.1), AnnotationFilter(v.1.12.0), GenomicFeatures(v.1.40.1), AnnotationDbi(v.1.50.3), gridExtra(v.2.3), UpSetR(v.1.4.0), ngsReports(v.1.4.2), harmonicmeanp(v.3.0), FMStable(v.0.1-2), RUVSeq(v.1.22.0), EDASeq(v.2.22.0), ShortRead(v.1.46.0), GenomicAlignments(v.1.24.0), SummarizedExperiment(v.1.18.2), DelayedArray(v.0.14.1), matrixStats(v.0.56.0), Rsamtools(v.2.4.0), GenomicRanges(v.1.40.0), GenomeInfoDb(v.1.24.2), Biostrings(v.2.56.0), XVector(v.0.28.0), IRanges(v.2.22.2), S4Vectors(v.0.26.1), BiocParallel(v.1.22.0), Biobase(v.2.48.0), goseq(v.1.40.0), geneLenDataBase(v.1.24.0), BiasedUrn(v.1.07), here(v.0.1), pheatmap(v.1.0.12), ggpubr(v.0.4.0), ggfortify(v.0.4.10), kableExtra(v.1.2.1), pander(v.0.6.3), msigdbr(v.7.1.1), fgsea(v.1.14.0), RColorBrewer(v.1.1-2), ggrepel(v.0.8.2), forcats(v.0.5.0), stringr(v.1.4.0), dplyr(v.1.0.2), purrr(v.0.3.4), readr(v.1.3.1), tidyr(v.1.1.2), tibble(v.3.0.3), ggplot2(v.3.3.2), tidyverse(v.1.3.0), scales(v.1.1.1), magrittr(v.1.5), AnnotationHub(v.2.20.2), BiocFileCache(v.1.12.1), dbplyr(v.1.4.4), BiocGenerics(v.0.34.0), tximport(v.1.16.1), edgeR(v.3.30.3) and limma(v.3.44.3)

loaded via a namespace (and not attached): R.utils(v.2.10.1), tidyselect(v.1.1.0), RSQLite(v.2.2.0), htmlwidgets(v.1.5.1), FactoMineR(v.2.3), grid(v.4.0.2), DESeq(v.1.39.0), munsell(v.0.5.0), statmod(v.1.4.34), DT(v.0.15), withr(v.2.2.0), colorspace(v.1.4-1), highr(v.0.8), knitr(v.1.29), rstudioapi(v.0.11), leaps(v.3.1), ggsignif(v.0.6.0), labeling(v.0.3), GenomeInfoDbData(v.1.2.3), hwriter(v.1.3.2), farver(v.2.0.3), bit64(v.4.0.5), rhdf5(v.2.32.2), rprojroot(v.1.3-2), vctrs(v.0.3.4), generics(v.0.0.2), xfun(v.0.16), R6(v.2.4.1), locfit(v.1.5-9.4), bitops(v.1.0-6), assertthat(v.0.2.1), promises(v.1.1.1), gtable(v.0.3.0), rlang(v.0.4.7), genefilter(v.1.70.0), scatterplot3d(v.0.3-41), splines(v.4.0.2), lazyeval(v.0.2.2), rtracklayer(v.1.48.0), rstatix(v.0.6.0), broom(v.0.7.0), reshape2(v.1.4.4), BiocManager(v.1.30.10), yaml(v.2.2.1), abind(v.1.4-5), modelr(v.0.1.8), backports(v.1.1.9), httpuv(v.1.5.4), tools(v.4.0.2), ellipsis(v.0.3.1), ggdendro(v.0.1.21), plyr(v.1.8.6), Rcpp(v.1.0.5), progress(v.1.2.2), zlibbioc(v.1.34.0), RCurl(v.1.98-1.2), prettyunits(v.1.1.1), openssl(v.1.4.2), viridis(v.0.5.1), zoo(v.1.8-8), cluster(v.2.1.0), haven(v.2.3.1), fs(v.1.5.0), data.table(v.1.13.0), openxlsx(v.4.1.5), reprex(v.0.3.0), truncnorm(v.1.0-8), ProtGenerics(v.1.20.0), aroma.light(v.3.18.0), hms(v.0.5.3), mime(v.0.9), evaluate(v.0.14), xtable(v.1.8-4), XML(v.3.99-0.5), rio(v.0.5.16), jpeg(v.0.1-8.1), readxl(v.1.3.1), compiler(v.4.0.2), biomaRt(v.2.44.1), crayon(v.1.3.4), R.oo(v.1.24.0), htmltools(v.0.5.0), mgcv(v.1.8-33), later(v.1.1.0.1), geneplotter(v.1.66.0), lubridate(v.1.7.9), DBI(v.1.1.0), MASS(v.7.3-52), rappdirs(v.0.3.1), Matrix(v.1.2-18), car(v.3.0-9), cli(v.2.0.2), R.methodsS3(v.1.8.1), pkgconfig(v.2.0.3), flashClust(v.1.01-2), foreign(v.0.8-80), plotly(v.4.9.2.1), xml2(v.1.3.2), annotate(v.1.66.0), webshot(v.0.5.2), rvest(v.0.3.6), digest(v.0.6.25), rmarkdown(v.2.3), cellranger(v.1.1.0), fastmatch(v.1.1-0), curl(v.4.3), shiny(v.1.5.0), lifecycle(v.0.2.0), nlme(v.3.1-149), jsonlite(v.1.7.0), Rhdf5lib(v.1.10.1), carData(v.3.0-4), viridisLite(v.0.3.0), askpass(v.1.1), fansi(v.0.4.1), pillar(v.1.4.6), lattice(v.0.20-41), fastmap(v.1.0.1), httr(v.1.4.2), survival(v.3.2-3), GO.db(v.3.11.4), interactiveDisplayBase(v.1.26.3), glue(v.1.4.2), zip(v.2.1.1), png(v.0.1-7), BiocVersion(v.3.11.1), bit(v.4.0.4), stringi(v.1.4.6), blob(v.1.2.1), latticeExtra(v.0.6-29) and memoise(v.1.1.0)